Clueing in DC: An Analysis of DC Crime Data from 2018-2022

DSAN 5200 Final Project

Authors
Affiliation

Brian Kwon

Georgetown University

Powell Sheagren

Dheeraj Oruganty

Published

April 29, 2024

Introduction

Figure 1: Comparing Crime Across Cities in 2019

Code
library(rvest)
library(tidyverse)
library(plotly)

# Parse 2019 crime rate data from wikipedia
url = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_crime_rate"
page = read_html(url)
tables = html_table(page, fill = TRUE)
crime_data = tables[[1]]

# Preprocess the dataset
colnames(crime_data) = crime_data[2, ]
crime_data = crime_data[-c(1,2), ]
crime_data = crime_data %>% select(1,2,3,4) # Remove unnecessary columns
colnames(crime_data) = c("state", "city", "population", "crime_rate") # Change column names
crime_data$population = as.numeric(gsub(",", "", crime_data$population)) # Change to numeric
crime_data$crime_rate = as.numeric(crime_data$crime_rate) # Change to numeric

# Leave only one city per state by population
crime_data = crime_data %>%
  group_by(state) %>%
  slice(which.max(population))

# Remove footnote number
crime_data$city = gsub("\\d+$", "", crime_data$city)
crime_data$state =  gsub("\\d+$", "", crime_data$state)

# Change some city name manually for merging
crime_data = crime_data %>%
  mutate(city = if_else(city == "Washington, D.C.", "Washington", city)) %>%
  mutate(city = if_else(city == "Louisville Metro", "Louisville", city))

# Get latitude and longitude data
location = read.csv("./data/uscities.csv")
location = location %>% select(1,4,lat,lng)

# Merge two data sets
df = merge(crime_data, location, by = "city")
df = df %>% 
    filter(state == state_name) %>%
    select(-state_name)

# Color palette
# colors = c("#F1FAEE", "#A8DADC", "#457B9D", "#1D3557")
# colors = c("#ccdbdc","#9ad1d4","#80ced7","#007ea7","#003249")
colors = c("#ccdbdc","#edf8b1", "#7fcdbb", "#2c7fb8")
# colors = c("#caf0f8", "#ade8f4", "#90e0ef", "#48cae4", "#00b4d8", "#0096c7", "#0077b6", "#023e8a", "#03045e") 

# Plot bubble map
map = plot_geo(df, lat = ~lat, lon = ~lng) %>%
  add_markers(
    text = ~paste("State: ", state, "<br>City: ", city, "<br>Crime Rate: ", crime_rate, "<br>Population: ", population), 
    size = ~population, 
    color = ~crime_rate,
    colors = colors,
    opacity = 10000,
    marker = list(sizemode = 'area', sizeref = 0.2, line = list(color = 'black', width = 2))) %>%
    colorbar(title = "Crime Rate") %>%
  layout(title = 'Crime Rate Bubble Map for US cities in 2019', geo = list(scope = 'usa'),
         annotations = list(list(x = 0.8, y = 0.55, text = "Washington D.C.", showarrow = TRUE, xanchor = 'left', yanchor = 'bottom',  ax = 30, ay = 30, font = list(size = 12, color = "red")),
                       list(x = 1, y = 0,  text = "Size by population", showarrow = FALSE, xanchor='right', yanchor='auto', xshift=0, yshift=0, font=list(size=12, color="grey"))))
map

Figure 1: This map represents the relative population and crime rates in the U.S. cities. The size of the dots shows the population of the city and the color represents the amount of crime per 100,000 people.

Figure 2: Crime Proportions in DC From 2018-2022

Code
library(tidyverse)
library(DT)

# Read data files
offense_22 = read.csv("./data/DC-2022/NIBRS_OFFENSE.csv") 
offense_21 = read.csv("./data/DC-2021/NIBRS_OFFENSE.csv") 
offense_20 = read.csv("./data/DC-2020/NIBRS_OFFENSE.csv") 
offense_19 = read.csv("./data/DC-2019/NIBRS_OFFENSE.csv") 
offense_18 = read.csv("./data/DC-2018/NIBRS_OFFENSE.csv") 
offense_code1 = read.csv("./data/DC-2022/NIBRS_OFFENSE_TYPE.csv")
offense_code2 = read.csv("./data/DC-2018/NIBRS_OFFENSE_TYPE.csv")

# Merge with code files for corresponding offense names
offense_22 = merge(offense_22, offense_code1, by = "offense_code")
offense_21 = merge(offense_21, offense_code1, by = "offense_code")
offense_20 = merge(offense_20, offense_code2, by = "OFFENSE_TYPE_ID")
offense_19 = merge(offense_19, offense_code2, by = "OFFENSE_TYPE_ID")
offense_18 = merge(offense_18, offense_code2, by = "OFFENSE_TYPE_ID")

# Calculate the percentage based on the count
offense_22_count = as.data.frame(round(table(offense_22$offense_name)/nrow(offense_22)*100,2))
offense_21_count = as.data.frame(round(table(offense_21$offense_name)/nrow(offense_21)*100,2))
offense_20_count = as.data.frame(round(table(offense_20$OFFENSE_NAME)/nrow(offense_20)*100,2))
offense_19_count = as.data.frame(round(table(offense_19$OFFENSE_NAME)/nrow(offense_19)*100,2))
offense_18_count = as.data.frame(round(table(offense_18$OFFENSE_NAME)/nrow(offense_18)*100,2))

# Merge all years
offense_df = merge(merge(merge(merge(offense_18_count, offense_19_count, by = "Var1", all = TRUE), offense_20_count, by = "Var1", all = TRUE), offense_21_count, by = "Var1", all = TRUE), offense_22_count, by = "Var1", all = TRUE)
colnames(offense_df) = c("Offense Type", "2018", "2019", "2020", "2021", "2022")

# Create datatable
datatable(data = offense_df, caption = "Table", filter = "top")

Figure 2: Data was collected from the 2018-2022 FBI’s National Incident-Based Reporting System. The values are the percentage of a total crime that an individual offense made up.

Figure 3: Relationships Between Offenses Within Incidents

Code
import plotly.graph_objects as go
import numpy as np
import networkx as nx

## Code for this graph generously donated from:
# https://plotly.com/python/network-graphs/

## importing matrix
matrix = np.genfromtxt('./data/adjacency_matrix.csv', delimiter = ",")
# list = np.genfromtxt('Adjacency_list.csv', delimiter = ",")

## Turning adjacency matrix to graph obkect
G = nx.from_numpy_array(matrix,create_using=nx.DiGraph)

## Using a spiral layout to show centrality
pos = nx.spiral_layout(G)

## Adding position based on the layout
for i in range(0,42):
    for g in range(0,42):
        G.nodes[i]['pos'] = pos[i]
        G.nodes[g]['pos'] = pos[g]

## Adding edges together
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = G.nodes[edge[0]]['pos']
    x1, y1 = G.nodes[edge[1]]['pos']
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

## arranging them into lines
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

## adding nodes to graph
node_x = []
node_y = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

## assembly again
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale = ["#E63946", "#F1FAEE", "#A8DADC", "#457B9D", "#1D3557"],
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line_width=2))

## Offenses for tooltip
offenses_list = [ "Destruction/Damage/Vandalism of Property",    "Theft From Motor Vehicle"  ,                 
  "Robbery"                      ,               "Simple Assault"           ,                  
  "Intimidation"                 ,               "All Other Larceny"         ,                 
  "Motor Vehicle Theft"           ,              "Drug Equipment Violations" ,                 
 "Drug/Narcotic Violations"     ,               "Weapon Law Violations"     ,                 
 "Stolen Property Offenses"     ,               "Aggravated Assault"        ,                 
 "Purse-snatching"              ,               "Extortion/Blackmail"         ,               
 "Theft From Building"          ,               "Fondling"                  ,                 
 "Counterfeiting/Forgery"       ,               "Theft of Motor Vehicle Parts or Accessories",
 "Credit Card/Automated Teller Machine Fraud" , "Impersonation"      ,                        
 "Pocket-picking"                     ,         "Kidnapping/Abduction"     ,                  
 "False Pretenses/Swindle/Confidence Game"   ,  "Burglary/Breaking & Entering"       ,        
 "Rape"                      ,                  "Murder and Nonnegligent Manslaughter"   ,    
 "Theft From Coin-Operated Machine or Device" , "Animal Cruelty"          ,                   
 "Shoplifting"             ,                    "Hacking/Computer Invasion"      ,            
 "Identity Theft"         ,                     "Wire Fraud"                ,                 
 "Arson"                 ,                      "Betting/Wagering"         ,                  
 "Welfare Fraud"        ,                       "Pornography/Obscene Material"     ,          
 "Bribery"           ,                          "Purchasing Prostitution"       ,             
 "Prostitution"                 ,               "Sodomy"                 ,                    
 "Sexual Assault With An Object", "Other"]

# getting tooltip
node_adjacencies = []
node_text = []
for node, adjacencies in enumerate(G.adjacency()):
    node_adjacencies.append(len(adjacencies[1]))
    node_text.append('# of connections: '+str(len(adjacencies[1])) + " | Offense Type: " + offenses_list[node])

node_trace.marker.color = node_adjacencies
node_trace.text = node_text

## Plotting the figure
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='Amount of times an Offense is Listed with other Ofenses',
                titlefont_size=16,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                ))
# fig.update_traces(marker = dict(size = node_adjacencies));
fig.update_traces(marker=dict(size=node_adjacencies, colorbar_title='Interaction Count'));
fig.show()

Figure 3: The network diagram shows data from all years where one incident involved multiple offenses. The lines represent connections between crimes and the size of the nodes mean the amount of times the crime was connected to another.

Figure 4: Offense by Relationship to Victim Heatmap

Code
library(tidyverse)
library(plotly)
library(heatmaply)

## 2018
offense_data_2018 <- read.csv("data/DC-2018/NIBRS_OFFENSE.csv") %>% mutate(year = 2018)
offense_2018 <- read.csv("data/DC-2018/NIBRS_OFFENSE_TYPE.csv")
victim_data_2018 <- read.csv("data/DC-2018/NIBRS_VICTIM.csv") %>% mutate(year = 2018)
relation_2018 <- read.csv("data/DC-2018/NIBRS_VICTIM_OFFENDER_REL.csv")
relationship_2018 <- read.csv("data/DC-2018/NIBRS_RELATIONSHIP.csv")

offense_data_2018 <- left_join(offense_data_2018,offense_2018, by = "OFFENSE_TYPE_ID")
relation_2018 <- left_join(relation_2018,relationship_2018, by = "RELATIONSHIP_ID")
victim_data_2018 <- right_join(victim_data_2018,relation_2018, by = "VICTIM_ID")
total_data_2018 <- left_join(victim_data_2018,offense_data_2018, by = c("INCIDENT_ID","year"))
total_data_2018 <- total_data_2018 %>% select(c(RELATIONSHIP_NAME,OFFENSE_CATEGORY_NAME, year))

## 2019
offense_data_2019 <- read.csv("data/DC-2019/NIBRS_OFFENSE.csv") %>% mutate(year = 2019)
offense_2019 <- read.csv("data/DC-2019/NIBRS_OFFENSE_TYPE.csv")
victim_data_2019 <- read.csv("data/DC-2019/NIBRS_VICTIM.csv") %>% mutate(year = 2019)
relation_2019 <- read.csv("data/DC-2019/NIBRS_VICTIM_OFFENDER_REL.csv")
relationship_2019 <- read.csv("data/DC-2019/NIBRS_RELATIONSHIP.csv")

offense_data_2019 <- left_join(offense_data_2019,offense_2019, by = "OFFENSE_TYPE_ID")
relation_2019 <- left_join(relation_2019,relationship_2019, by = "RELATIONSHIP_ID")
victim_data_2019 <- right_join(victim_data_2019,relation_2019, by = "VICTIM_ID")
total_data_2019 <- left_join(victim_data_2019,offense_data_2019, c("INCIDENT_ID","year"))
total_data_2019 <- total_data_2019 %>% select(c(RELATIONSHIP_NAME,OFFENSE_CATEGORY_NAME, year))

## 2020
offense_data_2020 <- read.csv("data/DC-2020/NIBRS_OFFENSE.csv") %>% mutate(year = 2020)
offense_2020 <- read.csv("data/DC-2020/NIBRS_OFFENSE_TYPE.csv")
victim_data_2020 <- read.csv("data/DC-2020/NIBRS_VICTIM.csv") %>% mutate(year = 2020)
relation_2020 <- read.csv("data/DC-2020/NIBRS_VICTIM_OFFENDER_REL.csv")
relationship_2020 <- read.csv("data/DC-2020/NIBRS_RELATIONSHIP.csv")

offense_data_2020 <- left_join(offense_data_2020,offense_2020, by = "OFFENSE_TYPE_ID")
relation_2020 <- left_join(relation_2020,relationship_2020, by = "RELATIONSHIP_ID")
victim_data_2020 <- right_join(victim_data_2020,relation_2020, by = "VICTIM_ID")
total_data_2020 <- left_join(victim_data_2020,offense_data_2020, by = c("INCIDENT_ID","year"))
total_data_2020 <- total_data_2020 %>% select(c(RELATIONSHIP_NAME,OFFENSE_CATEGORY_NAME, year))

## 2021
offense_data_2021 <- read.csv("data/DC-2021/NIBRS_OFFENSE.csv") %>% mutate(year = 2021)
offense_2021 <- read.csv("data/DC-2021/NIBRS_OFFENSE_TYPE.csv")
victim_data_2021 <- read.csv("data/DC-2021/NIBRS_VICTIM.csv") %>% mutate(year = 2021)
relation_2021 <- read.csv("data/DC-2021/NIBRS_VICTIM_OFFENDER_REL.csv")
relationship_2021 <- read.csv("data/DC-2021/NIBRS_RELATIONSHIP.csv")

offense_data_2021 <- left_join(offense_data_2021,offense_2021, by = "offense_code")
relation_2021 <- left_join(relation_2021,relationship_2021, by = "relationship_id")
victim_data_2021 <- right_join(victim_data_2021,relation_2021, by = "victim_id")
total_data_2021 <- left_join(victim_data_2021,offense_data_2021, by = c("incident_id","year"))
total_data_2021 <- total_data_2021 %>% select(c(relationship_name,offense_category_name, year))

## 2022
offense_data_2022 <- read.csv("data/DC-2022/NIBRS_OFFENSE.csv") %>% mutate(year = 2022)
offense_2022 <- read.csv("data/DC-2022/NIBRS_OFFENSE_TYPE.csv")
victim_data_2022 <- read.csv("data/DC-2022/NIBRS_VICTIM.csv") %>% mutate(year = 2022)
relation_2022 <- read.csv("data/DC-2022/NIBRS_VICTIM_OFFENDER_REL.csv")
relationship_2022 <- read.csv("data/DC-2022/NIBRS_RELATIONSHIP.csv")

offense_data_2022 <- left_join(offense_data_2022,offense_2022, by = "offense_code")
relation_2022 <- left_join(relation_2022,relationship_2022, by = "relationship_id")
victim_data_2022 <- right_join(victim_data_2022,relation_2022, by = "victim_id")
total_data_2022 <- left_join(victim_data_2022,offense_data_2022, by = c("incident_id","year"))
total_data_2022 <- total_data_2022 %>% select(c(relationship_name,offense_category_name, year))

## adjusting colnames for difference
colnames(total_data_2021) <- c("RELATIONSHIP_NAME","OFFENSE_CATEGORY_NAME", "year")
colnames(total_data_2022) <- c("RELATIONSHIP_NAME","OFFENSE_CATEGORY_NAME", "year")

## groups
total_data_relation <- rbind(total_data_2018, total_data_2019, total_data_2020, total_data_2021, total_data_2022)

## relationships store for next chunk
relationships <- total_data_relation$RELATIONSHIP_NAME %>% factor() %>% levels()

## Splitting the relationships type into indicies and then filtering by them
family_relationships_index <- c(6,14,15,16,19,21,22)
partner_relationships_index <- c(1,5,7,8,11,12,23,24,25,26)
acquaintance_relationships_index <- c(3,4,9,10,13,17,18,20)
stranger_relationships_index <- c(27)
other_relationships_index <- c(2)

family_relationships <- relationships[family_relationships_index]
partner_relationships <- relationships[partner_relationships_index]
acquaintance_relationships <- relationships[acquaintance_relationships_index]
stranger_relationships <- relationships[stranger_relationships_index]
other_relationships <- relationships[other_relationships_index]

## Function for new column of values
relation_checker <- function(value){
  if(value %in% family_relationships){
    val <- "Family"
  } else if(value %in% partner_relationships){
    val <- "Partner/Partners Family"
  } else if(value %in% acquaintance_relationships){
    val <- "Acquaintance"
  } else if(value %in% stranger_relationships){
    val <- "Stranger"
  } else{
    val <- "Other"
  }
}

## vectorizing the function and adding the colum
relation_checker <- Vectorize(relation_checker)

total_data_relation <- total_data_relation %>% mutate(Relation_group = relation_checker(RELATIONSHIP_NAME)) %>% filter(Relation_group != "Other")
#total_data_relation$Relation_group %>% table()

## Making matrix for Viz
mat <- total_data_relation %>% group_by(Relation_group,OFFENSE_CATEGORY_NAME) %>% tally() %>%
  spread(Relation_group,n) %>% as.data.frame()
mat[is.na(mat)] <- 0
rownames(mat) <- mat$OFFENSE_CATEGORY_NAME
mat <- mat %>% select(-OFFENSE_CATEGORY_NAME)

# Color palette
colors = c("#F1FAEE", "#A8DADC", "#457B9D", "#1D3557")
# colors = c("#ccdbdc","#edf8b1", "#7fcdbb", "#2c7fb8")

## Heatmap code
ptotal <- heatmaply(mat,
                    label_names = c("Crime Group", "Relation", "Relation Prevelance"),
                    colors = colors,
                    # width  = 800, 
                    height = 600,
                    dendrogram = FALSE,
                    # limits = c(0,10000),
                    scale = "row",
                    branches_lwd = 0.1,
                    # hide_colorbar = TRUE,
                    grid_color = "white",
                    grid_width = 0.00001,
                    dend_hoverinfo = FALSE,
                    main = "Heatmap of offense category by relationship between victim and offender")
ptotal

Fig 4: This heat map shows the amount, by color, of each offense category scaled by each row and what the relationship of the victim was to the offender for the amount.

Figure 5: Interactive Location of Offenses in 2018, 2020, 2022

Figure 5: Interactive Visualization of Top 15 Crime Incidents by Location in Washington D.C. for the Years 2018, 2020, and 2022

Figure 6: Sankey Diagram of Weapon Type and Injury by Offense

Code
library(tidyverse)
library(networkD3)
library(htmlwidgets)
library(htmltools)

# Read all necessary files
offense_18 = read.csv("./data/DC-2018/NIBRS_OFFENSE.csv") 
offender_18 = read.csv("./data/DC-2018/NIBRS_OFFENDER.csv") 
victim_18 = read.csv("./data/DC-2018/NIBRS_VICTIM.csv")
weapon_18 = read.csv("./data/DC-2018/NIBRS_WEAPON.csv") 
injury_18 = read.csv("./data/DC-2018/NIBRS_VICTIM_INJURY.csv")

# Select offense_id, incident_id, offender_id, victim_id, offense_code, injury_id, weapon_id
offense_18 = offense_18 %>% select(2,3,4)
offender_18 = offender_18 %>% select(2,3)
victim_18 = victim_18 %>% select(2,3)
weapon_18 = weapon_18 %>% select(2,3)
injury_18 = injury_18 %>% select(2,3)

# Read codes files for nodes
offense_code = read.csv("./data/DC-2018/NIBRS_OFFENSE_TYPE.csv")
injury_code = read.csv("./data/DC-2018/NIBRS_INJURY.csv")
weapon_code = read.csv("./data/DC-2018/NIBRS_WEAPON_TYPE.csv")

# Get offense_code, offense_type_id, offense_name
offense_code = offense_code %>% select(1,2,3)
# Change offense_type_id to offense_code
offense_18 = merge(offense_18, offense_code, by = "OFFENSE_TYPE_ID")
offense_18 = offense_18 %>% select(2,3,4)
# Merge by incident_id, offense_id, victim_id
df_18 = merge(merge(merge(merge(offense_18, offender_18, by = "INCIDENT_ID"), victim_18, by = "INCIDENT_ID"), injury_18, by = "VICTIM_ID"), weapon_18, by = "OFFENSE_ID")
# Remove incident_id, offense_id, victim_id, offender_id
df_18 = df_18 %>% select(-1,-2,-3,-5)

# # Make column names to lower case
colnames(df_18) = tolower(colnames(df_18))

# Paste character to make ids unique
df_18$injury_id = paste0("i", df_18$injury_id)
df_18$weapon_id = paste0("w", df_18$weapon_id)

# # Count the unique combinations of offense types and weapon types and subset if there are more than 100 cases
first_link = df_18 %>%
    group_by(offense_code, weapon_id) %>%
    summarise(value = n(), .groups = "drop") %>%
    arrange(desc(value)) %>%
    rename(source = offense_code, target = weapon_id) %>%
    filter(value > 100)
# # Count the unique combinations of weapon types and injury types and subset if there are more than 100 cases
second_link = df_18 %>%
    group_by(weapon_id, injury_id) %>%
    summarise(value = n(), .groups = "drop") %>%
    arrange(desc(value)) %>%
    rename(source = weapon_id, target = injury_id) %>%
    filter(value > 100)
# # Combine those two links
links.df = as.data.frame(rbind(first_link,second_link))

# Get the codes and names
offense_code = offense_code %>%
    select(2,3) %>%
    rename(name = OFFENSE_CODE, label = OFFENSE_NAME)
injury_code = injury_code %>% 
    select(1,3) %>%
    rename(name = INJURY_ID, label = INJURY_NAME)
weapon_code = weapon_code %>% 
    select(1,3) %>%
    rename(name = WEAPON_ID, label = WEAPON_NAME)

# Make codes unique
injury_code$name = paste0("i", injury_code$name)
weapon_code$name = paste0("w", weapon_code$name)
# Combine all the nodes
nodes.df = rbind(offense_code, injury_code, weapon_code)
# Subset only nodes from the links
nodes.df = nodes.df %>% filter(name %in% c(unique(first_link$source),unique(first_link$target),unique(second_link$target)))

# Create source_id and target_id for a sankey diagram
links.df$source_id = match(links.df$source, nodes.df$name) - 1 
links.df$target_id = match(links.df$target, nodes.df$name) - 1 

# Color Palette
my_color = 'd3.scaleOrdinal().range(["#ccdbdc", "#edf8b1", "#7fcdbb", "#2c7fb8"])'

# Create a sankey diagram
net = sankeyNetwork(Links = links.df,     
              Nodes = nodes.df,     
              Source = 'source_id', 
              Target = 'target_id', 
              Value = 'value',     
              NodeID = 'label',      
              fontSize = 16,        
              colourScale=my_color, 
              iterations = 0)

# Add a title
net_with_title = prependContent(net, tags$b(HTML('Injuries and weapon type by offense type in 2018')))
net_with_title
Injuries and weapon type by offense type in 2018
Code
library(tidyverse)
library(networkD3)
library(htmlwidgets)
library(htmltools)

# Read all necessary files
offense_22 = read.csv("./data/DC-2022/NIBRS_OFFENSE.csv") 
offender_22 = read.csv("./data/DC-2022/NIBRS_OFFENDER.csv") 
victim_22 = read.csv("./data/DC-2022/NIBRS_VICTIM.csv")
weapon_22 = read.csv("./data/DC-2022/NIBRS_WEAPON.csv") 
injury_22 = read.csv("./data/DC-2022/NIBRS_VICTIM_INJURY.csv")


# Select offense_id, incident_id, offender_id, victim_id, offense_code, injury_id, weapon_id
offense_22 = offense_22 %>% select(2,3,4)
offender_22 = offender_22 %>% select(2,3)
victim_22 = victim_22 %>% select(2,3)
weapon_22 = weapon_22 %>% select(2,3)
injury_22 = injury_22 %>% select(2,3)

# Merge by incident_id, offense_id, victim_id
df_22 = merge(merge(merge(merge(offense_22, offender_22, by = "incident_id"), victim_22, by = "incident_id"), injury_22, by = "victim_id"), weapon_22, by = "offense_id")
# Remove incident_id, offense_id, victim_id, offender_id
df_22 = df_22 %>% select(-1,-2,-3,-5)

# Paste character to make ids unique
df_22$injury_id = paste0("i", df_22$injury_id)
df_22$weapon_id = paste0("w", df_22$weapon_id)

# Count the unique combinations of offense types and weapon types and subset if there are more than 100 cases
first_link = df_22 %>%
    group_by(offense_code, weapon_id) %>%
    summarise(value = n(), .groups = "drop") %>%
    arrange(desc(value)) %>%
    rename(source = offense_code, target = weapon_id) %>%
    filter(value > 100)
# Count the unique combinations of weapon types and injury types and subset if there are more than 100 cases
second_link = df_22 %>%
    group_by(weapon_id, injury_id) %>%
    summarise(value = n(), .groups = "drop") %>%
    arrange(desc(value)) %>%
    rename(source = weapon_id, target = injury_id) %>%
    filter(value > 100)
# Combine those two links
links.df = as.data.frame(rbind(first_link,second_link))

# Read codes files for nodes
offense_code = read.csv("./data/DC-2022/NIBRS_OFFENSE_TYPE.csv")
injury_code = read.csv("./data/DC-2022/NIBRS_INJURY.csv")
weapon_code = read.csv("./data/DC-2022/NIBRS_WEAPON_TYPE.csv")

# Get the codes and names
offense_code = offense_code %>% 
    select(1,2) %>%
    rename(name = offense_code, label = offense_name)
injury_code = injury_code %>% 
    select(1,3) %>%
    rename(name = injury_id, label = injury_name)
weapon_code = weapon_code %>% 
    select(1,3) %>%
    rename(name = weapon_id, label = weapon_name)

# Make codes unique
injury_code$name = paste0("i", injury_code$name)
weapon_code$name = paste0("w", weapon_code$name)
# Combine all the nodes
nodes.df = rbind(offense_code, injury_code, weapon_code)
# Subset only nodes from the links
nodes.df = nodes.df %>% filter(name %in% c(unique(first_link$source),unique(first_link$target),unique(second_link$target)))

# Create source_id and target_id for a sankey diagram
links.df$source_id = match(links.df$source, nodes.df$name) - 1 
links.df$target_id = match(links.df$target, nodes.df$name) - 1 

# Color Palette
my_color = 'd3.scaleOrdinal().range(["#F1FAEE", "#A8DADC", "#457B9D", "#1D3557"])'

# Color groupings
# nodes.df = nodes.df %>%
#   mutate(group = ifelse(name == "13B", "a",
#                           ifelse(name == "13A", "b", 
#                                 ifelse(name == "120", "c", "g")))) %>%
#   mutate(group = ifelse(name == "w41", "e", "g")) %>%
#   mutate(group = ifelse(name == "i4", "f", "g"))

# Create a sankey diagram
net = sankeyNetwork(Links = links.df,     
              Nodes = nodes.df,     
              Source = 'source_id', 
              Target = 'target_id', 
              Value = 'value',     
              NodeID = 'label',      
              fontSize = 16,    
              colourScale=my_color,    
              iterations = 0)

# Add a title
net_with_title = prependContent(net, tags$b(HTML('Injuries and weapon type by offense type in 2022')))
net_with_title
Injuries and weapon type by offense type in 2022

Figure 6: This sankey diagram shows the offenses, the weapons used, and the amount of injury caused in 2018 and 2022. The paths between the values show the flow of this amount.

Conclusion